In [1]:
%matplotlib inline
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import datetime
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
In [2]:
import tabulate
In [3]:
# Names of the dataset partitions; each one is a sub-directory holding its own
# annotations file.
SPLITS = [
    'train',
    'test',
    'dev',
]
In [4]:
# Read the annotations table of every split into a dict keyed by split name.
df = {}
for split in SPLITS:
    annotations_path = '../../../../data/annotations/split/' + split + '/annotations.tsv'
    df[split] = pd.read_csv(annotations_path, sep='\t', encoding='utf-8')
In [5]:
# Stack the per-split tables into one frame.
# Concatenate in the explicit SPLITS order (dict iteration order is not
# guaranteed on older Python versions) and rebuild the index so that row
# labels are unique instead of repeating each split's own 0..n range.
combined_df = pd.concat([df[split] for split in SPLITS], ignore_index=True)
In [6]:
# Source labels that are matched against the `src` column when the revisions
# are split up by origin.
SOURCES = [
    'article_blocked',
    'article_random',
    'user_blocked',
    'user_random',
]
In [7]:
# Collapse the table to one row per revision: metadata columns keep the first
# value seen for the revision, score columns are averaged over its rows.
agg_dict = dict.fromkeys(['ns', 'sample', 'src'], 'first')
agg_dict.update(dict.fromkeys(['recipient', 'attack', 'aggression'], 'mean'))
grouped_df = combined_df.groupby('rev_id').agg(agg_dict)
In [8]:
# Split the per-revision table by source: a revision belongs to a source when
# its `src` value contains that source name. The full table is kept under the
# extra key 'total'.
grouped_source_df = {}
for source in SOURCES:
    from_this_source = grouped_df['src'].str.contains(source)
    grouped_source_df[source] = grouped_df[from_this_source]
grouped_source_df['total'] = grouped_df
In [9]:
# Row count of every frame in `grouped_source_df`, keyed the same way.
number_of_revisions = {
    label: frame.shape[0] for label, frame in grouped_source_df.items()
}
In [10]:
# Display the row count recorded for each key of `grouped_source_df`.
number_of_revisions
Out[10]:
In [11]:
# For each label ('attack', 'aggression') and each source (plus 'total'):
#   num[label][source]  -> number of revisions whose mean score exceeds 0.5
#   perc[label][source] -> that number divided by the source's revision count
num = {}
perc = {}
for term in ['attack', 'aggression']:
    counts_by_source = {}
    share_by_source = {}
    num[term] = counts_by_source
    perc[term] = share_by_source
    for source in SOURCES + ['total']:
        revisions = grouped_source_df[source]
        flagged = len(revisions[revisions[term] > 0.5])
        counts_by_source[source] = flagged
        share_by_source[source] = flagged / number_of_revisions[source]
In [12]:
# Print, per label, its name followed by the per-source counts and then the
# per-source fractions.
for label in ('attack', 'aggression'):
    print(label)
    print(num[label])
    print(perc[label])
In [13]:
# Work on an independent copy: the following cells add columns to `dat`, and a
# plain alias would silently add them to `combined_df` as well, changing what
# earlier cells see when they are re-run.
dat = combined_df.copy()
In [14]:
# Display the column labels of `dat`.
dat.columns
Out[14]:
In [15]:
# Turn the per-row scores into 0/1 indicator columns (score > 0.5) together
# with their complements, so that summing them later yields, per revision, how
# many rows fell on each side of the threshold.
ATTACK_COLUMNS = ['attack_bool', 'not_attack']
AGGRESSIVE_COLUMNS = ['aggressive_bool', 'not_aggressive']

# `.astype(int)` converts the boolean mask in one vectorized step instead of
# calling `int` on every element.
dat['attack_bool'] = (dat['attack'] > 0.5).astype(int)
dat['not_attack'] = 1 - dat['attack_bool']

dat['aggressive_bool'] = (dat['aggression'] > 0.5).astype(int)
dat['not_aggressive'] = 1 - dat['aggressive_bool']
In [16]:
# Per-revision aggregation: metadata columns keep their first value, score
# columns are averaged, and the 0/1 indicator columns are summed.
agg_dict = {
    'ns': 'first',
    'sample': 'first',
    'src': 'first',
    'recipient': 'mean',
    'attack': 'mean',
    'aggression': 'mean',
}
agg_dict.update({column: 'sum' for column in ATTACK_COLUMNS + AGGRESSIVE_COLUMNS})
ia_df = dat.groupby('rev_id').agg(agg_dict)
In [17]:
%load_ext autoreload
%autoreload 2
from krippendorf_alpha import *
In [18]:
# Print the value returned by `Krippendorf_alpha` for each pair of indicator
# columns of the per-revision table.
# NOTE(review): presumably this is Krippendorff's alpha (inter-annotator
# agreement) - confirm against the krippendorf_alpha module.
for heading, label_columns in (('Attack: ', ATTACK_COLUMNS),
                               ('Aggression: ', AGGRESSIVE_COLUMNS)):
    print(heading)
    print(Krippendorf_alpha(ia_df, label_columns))
In [ ]: